{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pandas导入数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>email</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>jack@example.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>mary@example.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>lily@example.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>tom@example.com</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              email\n",
       "0  jack@example.com\n",
       "1  mary@example.com\n",
       "2  lily@example.com\n",
       "3   tom@example.com"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_table('./统计于分析模块Pandas/01.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>name</th>\n",
       "      <th>age</th>\n",
       "      <th>email</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>jack</td>\n",
       "      <td>18</td>\n",
       "      <td>jack@example.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>mary</td>\n",
       "      <td>19</td>\n",
       "      <td>mary@example.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>lily</td>\n",
       "      <td>17</td>\n",
       "      <td>lily@example.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>tom</td>\n",
       "      <td>17</td>\n",
       "      <td>tom@example.com</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>joe</td>\n",
       "      <td>20</td>\n",
       "      <td>joe@example.com</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   name  age             email\n",
       "0  jack   18  jack@example.com\n",
       "1  mary   19  mary@example.com\n",
       "2  lily   17  lily@example.com\n",
       "3   tom   17   tom@example.com\n",
       "4   joe   20   joe@example.com"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_table('./统计于分析模块Pandas/02.txt') # 通过tab键自动进行列分割(即四个空格)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>naame</th>\n",
       "      <th>password</th>\n",
       "      <th>uid</th>\n",
       "      <th>gid</th>\n",
       "      <th>local</th>\n",
       "      <th>home</th>\n",
       "      <th>shell</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>root</td>\n",
       "      <td>x</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>root</td>\n",
       "      <td>/root</td>\n",
       "      <td>/bin/bash</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>daemon</td>\n",
       "      <td>x</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>daemon</td>\n",
       "      <td>/usr/sbin</td>\n",
       "      <td>/usr/sbin/nologin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>bin</td>\n",
       "      <td>x</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>bin</td>\n",
       "      <td>/bin</td>\n",
       "      <td>/usr/sbin/nologin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>sys</td>\n",
       "      <td>x</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>sys</td>\n",
       "      <td>/dev</td>\n",
       "      <td>/usr/sbin/nologin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>sync</td>\n",
       "      <td>x</td>\n",
       "      <td>4</td>\n",
       "      <td>65534</td>\n",
       "      <td>sync</td>\n",
       "      <td>/bin</td>\n",
       "      <td>/bin/sync</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>games</td>\n",
       "      <td>x</td>\n",
       "      <td>5</td>\n",
       "      <td>60</td>\n",
       "      <td>games</td>\n",
       "      <td>/usr/games</td>\n",
       "      <td>/usr/sbin/nologin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>man</td>\n",
       "      <td>x</td>\n",
       "      <td>6</td>\n",
       "      <td>12</td>\n",
       "      <td>man</td>\n",
       "      <td>/var/cache/man</td>\n",
       "      <td>/usr/sbin/nologin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>lp</td>\n",
       "      <td>x</td>\n",
       "      <td>7</td>\n",
       "      <td>7</td>\n",
       "      <td>lp</td>\n",
       "      <td>/var/spool/lpd</td>\n",
       "      <td>/usr/sbin/nologin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>mail</td>\n",
       "      <td>x</td>\n",
       "      <td>8</td>\n",
       "      <td>8</td>\n",
       "      <td>mail</td>\n",
       "      <td>/var/mail</td>\n",
       "      <td>/usr/sbin/nologin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>news</td>\n",
       "      <td>x</td>\n",
       "      <td>9</td>\n",
       "      <td>9</td>\n",
       "      <td>news</td>\n",
       "      <td>/var/spool/news</td>\n",
       "      <td>/usr/sbin/nologin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>uucp</td>\n",
       "      <td>x</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "      <td>uucp</td>\n",
       "      <td>/var/spool/uucp</td>\n",
       "      <td>/usr/sbin/nologin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11</td>\n",
       "      <td>proxy</td>\n",
       "      <td>x</td>\n",
       "      <td>13</td>\n",
       "      <td>13</td>\n",
       "      <td>proxy</td>\n",
       "      <td>/bin</td>\n",
       "      <td>/usr/sbin/nologin</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     naame password  uid    gid   local             home              shell\n",
       "0     root        x    0      0    root            /root          /bin/bash\n",
       "1   daemon        x    1      1  daemon        /usr/sbin  /usr/sbin/nologin\n",
       "2      bin        x    2      2     bin             /bin  /usr/sbin/nologin\n",
       "3      sys        x    3      3     sys             /dev  /usr/sbin/nologin\n",
       "4     sync        x    4  65534    sync             /bin          /bin/sync\n",
       "5    games        x    5     60   games       /usr/games  /usr/sbin/nologin\n",
       "6      man        x    6     12     man   /var/cache/man  /usr/sbin/nologin\n",
       "7       lp        x    7      7      lp   /var/spool/lpd  /usr/sbin/nologin\n",
       "8     mail        x    8      8    mail        /var/mail  /usr/sbin/nologin\n",
       "9     news        x    9      9    news  /var/spool/news  /usr/sbin/nologin\n",
       "10    uucp        x   10     10    uucp  /var/spool/uucp  /usr/sbin/nologin\n",
       "11   proxy        x   13     13   proxy             /bin  /usr/sbin/nologin"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    " # sep 定义用什么进行分割 header = None表示不将第一条数据作为列名,names 定义表头\n",
    "pd.read_table('./统计于分析模块Pandas/03.txt', sep=':' ,header = None ,names = ['naame','password','uid','gid','local','home','shell'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>书名</th>\n",
       "      <th>单价</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>我在未来等你</td>\n",
       "      <td>29.9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>原则</td>\n",
       "      <td>63.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>半小时漫画世界史</td>\n",
       "      <td>28.9</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         书名    单价\n",
       "0    我在未来等你  29.9\n",
       "1        原则  63.0\n",
       "2  半小时漫画世界史  28.9"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# csv 解决了各种数据库数据交互存在的兼容问题，无论何种数据库都能导出CSV文件，CSV文件都能导入各种数据库生成数据表\n",
    "pd.read_csv('./统计于分析模块Pandas/04.csv') "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>电影名称</th>\n",
       "      <th>上映时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>神秘巨星</td>\n",
       "      <td>2018/1/19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>移动迷宫3</td>\n",
       "      <td>2018/1/26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>阿凡达2</td>\n",
       "      <td>2018/12/25</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    电影名称        上映时间\n",
       "0   神秘巨星   2018/1/19\n",
       "1  移动迷宫3   2018/1/26\n",
       "2   阿凡达2  2018/12/25"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_excel('./统计于分析模块Pandas/05.xlsx')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[   排名  城市    工资\n",
       " 0   1  北京  9240\n",
       " 1   2  上海  8962\n",
       " 2   3  深圳  8315\n",
       " 3   4  广州  7409\n",
       " 4   1  杭州  7330,     0   1   2\n",
       " 0  aa  bb  cc\n",
       " 1  11  22  33\n",
       " 2  44  55  66]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.read_html('./统计于分析模块Pandas/06.html')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>排名</th>\n",
       "      <th>城市</th>\n",
       "      <th>工资</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>北京</td>\n",
       "      <td>9240</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>上海</td>\n",
       "      <td>8962</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>深圳</td>\n",
       "      <td>8315</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>广州</td>\n",
       "      <td>7409</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>杭州</td>\n",
       "      <td>7330</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   排名  城市    工资\n",
       "0   1  北京  9240\n",
       "1   2  上海  8962\n",
       "2   3  深圳  8315\n",
       "3   4  广州  7409\n",
       "4   1  杭州  7330"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tables = pd.read_html('./统计于分析模块Pandas/06.html')\n",
    "tables[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>aa</th>\n",
       "      <th>bb</th>\n",
       "      <th>cc</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>11</td>\n",
       "      <td>22</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>44</td>\n",
       "      <td>55</td>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   aa  bb  cc\n",
       "0  11  22  33\n",
       "1  44  55  66"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tables = pd.read_html('./统计于分析模块Pandas/06.html',header = 0,attrs = {'class':'mydata2'}) #取class为mydata2的table标签\n",
    "tables[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pymysql"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "con = pymysql.connect(host = 'localhost',user = 'root',password = '123456',database = 'doubandb',charset = 'utf8',use_unicode = True)\n",
    "pd_sql = 'select * from books2'\n",
    "df = pd.read_sql(pd_sql,con)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>title</th>\n",
       "      <th>author</th>\n",
       "      <th>press</th>\n",
       "      <th>original</th>\n",
       "      <th>translator</th>\n",
       "      <th>imprint</th>\n",
       "      <th>pages</th>\n",
       "      <th>price</th>\n",
       "      <th>binding</th>\n",
       "      <th>series</th>\n",
       "      <th>isbn</th>\n",
       "      <th>score</th>\n",
       "      <th>number</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1000019</td>\n",
       "      <td>政治无意识</td>\n",
       "      <td>弗雷德里克.詹姆逊</td>\n",
       "      <td>中国社会科学出版社</td>\n",
       "      <td></td>\n",
       "      <td>王逢振/陈永国</td>\n",
       "      <td>1999-8</td>\n",
       "      <td>297</td>\n",
       "      <td>35.00元</td>\n",
       "      <td>平装</td>\n",
       "      <td>知识分子图书馆</td>\n",
       "      <td>9787500425564</td>\n",
       "      <td>7.5</td>\n",
       "      <td>107</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1000034</td>\n",
       "      <td>生死遗言</td>\n",
       "      <td>伊能静</td>\n",
       "      <td>现代出版社</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>2002-10</td>\n",
       "      <td>203</td>\n",
       "      <td>18.00元</td>\n",
       "      <td>平装</td>\n",
       "      <td></td>\n",
       "      <td>9787800288494</td>\n",
       "      <td>7.4</td>\n",
       "      <td>2377</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        ID  title     author      press original translator  imprint pages  \\\n",
       "0  1000019  政治无意识  弗雷德里克.詹姆逊  中国社会科学出版社             王逢振/陈永国   1999-8   297   \n",
       "1  1000034   生死遗言        伊能静      现代出版社                      2002-10   203   \n",
       "\n",
       "    price binding   series           isbn  score number  \n",
       "0  35.00元      平装  知识分子图书馆  9787500425564    7.5    107  \n",
       "1  18.00元      平装           9787800288494    7.4   2377  "
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## sort 排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    C\n",
       "3    A\n",
       "2    D\n",
       "dtype: object"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "st = pd.Series(list('CAD'),index=[1,3,2])\n",
    "st"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    C\n",
       "2    D\n",
       "3    A\n",
       "dtype: object"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "st.sort_index() # 索引排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3    A\n",
       "1    C\n",
       "2    D\n",
       "dtype: object"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "st.sort_values() # 值排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3    A\n",
       "2    D\n",
       "1    C\n",
       "dtype: object"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "st.sort_index(ascending = False) # 倒序排序(ascending = False,默认值为True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### DataFrame的排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[[4, 7, 5], [6, 3, 2], [4, 1, 8]]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "arr = [[4,7,5],\n",
    "      [6,3,2],\n",
    "       [4,1,8]\n",
    "      ]\n",
    "arr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>b</th>\n",
       "      <th>a</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   b  a  d\n",
       "1  4  7  5\n",
       "3  6  3  2\n",
       "2  4  1  8"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(arr,index=[1,3,2],columns = list('bad'))\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>b</th>\n",
       "      <th>a</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   b  a  d\n",
       "1  4  7  5\n",
       "2  4  1  8\n",
       "3  6  3  2"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sort_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   a  b  d\n",
       "1  7  4  5\n",
       "3  3  6  2\n",
       "2  1  4  8"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sort_index(axis = 1) # 根据表头排序"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>d</th>\n",
       "      <th>b</th>\n",
       "      <th>a</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>4</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>8</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   d  b  a\n",
       "1  5  4  7\n",
       "3  2  6  3\n",
       "2  8  4  1"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.sort_index(axis = 1,ascending = False) # 倒序排序(ascending = False,默认为False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>b</th>\n",
       "      <th>a</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   b  a  d\n",
       "1  4  7  5\n",
       "2  4  1  8\n",
       "3  6  3  2"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 根据某一列进行排序\n",
    "df.sort_values(by = 'b')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>b</th>\n",
       "      <th>a</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   b  a  d\n",
       "2  4  1  8\n",
       "1  4  7  5\n",
       "3  6  3  2"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 根据多列进行排序\n",
    "df.sort_values(by = ['b','a'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## rank 排名"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>b</th>\n",
       "      <th>a</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>1.5</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     b    a    d\n",
       "1  1.5  3.0  2.0\n",
       "3  3.0  2.0  1.0\n",
       "2  1.5  1.0  3.0"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.rank()# 对每一列的值进行排名，值越小排名越靠前，相同的去平均值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>b</th>\n",
       "      <th>a</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     b    a    d\n",
       "1  1.0  3.0  2.0\n",
       "3  3.0  2.0  1.0\n",
       "2  2.0  1.0  3.0"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.rank(method = 'first')# 如果值相等，则越靠前的(物理索引的值越小)则排名越靠前"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>b</th>\n",
       "      <th>a</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     b    a    d\n",
       "1  2.0  3.0  2.0\n",
       "3  3.0  2.0  1.0\n",
       "2  2.0  1.0  3.0"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.rank(method='max')# 如果值相等,则排名都后退一位(例如同一列的有两个值都是最小，则都都排第二)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>b</th>\n",
       "      <th>a</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     b    a    d\n",
       "1  1.0  3.0  2.0\n",
       "3  3.0  2.0  1.0\n",
       "2  1.0  1.0  3.0"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.rank(method='min')# 如果值相等,则排名相同，其他值的排名后退一位"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## merge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stu_no</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>s1</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>s2</td>\n",
       "      <td>89</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>s3</td>\n",
       "      <td>81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>s4</td>\n",
       "      <td>94</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  stu_no  score\n",
       "0     s1     97\n",
       "1     s2     89\n",
       "2     s3     81\n",
       "3     s4     94"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1 = pd.DataFrame({\n",
    "    'stu_no':['s1','s2','s3','s4'],\n",
    "    'score':np.random.randint(50,100,size=4)\n",
    "})\n",
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stu_no</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>s1</td>\n",
       "      <td>张三</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>s2</td>\n",
       "      <td>李四</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>s3</td>\n",
       "      <td>王五</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>s5</td>\n",
       "      <td>赵六</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  stu_no name\n",
       "0     s1   张三\n",
       "1     s2   李四\n",
       "2     s3   王五\n",
       "3     s5   赵六"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2 = pd.DataFrame({\n",
    "    'stu_no':['s1','s2','s3','s5'],\n",
    "    'name':['张三','李四','王五','赵六']\n",
    "})\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stu_no</th>\n",
       "      <th>score</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>s1</td>\n",
       "      <td>97</td>\n",
       "      <td>张三</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>s2</td>\n",
       "      <td>89</td>\n",
       "      <td>李四</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>s3</td>\n",
       "      <td>81</td>\n",
       "      <td>王五</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  stu_no  score name\n",
       "0     s1     97   张三\n",
       "1     s2     89   李四\n",
       "2     s3     81   王五"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(df1,df2,on='stu_no') # 通过stu_no字段进行连接两张表，内连接inner"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stu_no</th>\n",
       "      <th>score</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>s1</td>\n",
       "      <td>97</td>\n",
       "      <td>张三</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>s2</td>\n",
       "      <td>89</td>\n",
       "      <td>李四</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>s3</td>\n",
       "      <td>81</td>\n",
       "      <td>王五</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>s4</td>\n",
       "      <td>94</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  stu_no  score name\n",
       "0     s1     97   张三\n",
       "1     s2     89   李四\n",
       "2     s3     81   王五\n",
       "3     s4     94  NaN"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(df1,df2,on='stu_no',how='left') #左连接，以左边的表为准，如果右边没有对应的数据则为空"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stu_no</th>\n",
       "      <th>score</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>s1</td>\n",
       "      <td>97.0</td>\n",
       "      <td>张三</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>s2</td>\n",
       "      <td>89.0</td>\n",
       "      <td>李四</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>s3</td>\n",
       "      <td>81.0</td>\n",
       "      <td>王五</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>s5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>赵六</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  stu_no  score name\n",
       "0     s1   97.0   张三\n",
       "1     s2   89.0   李四\n",
       "2     s3   81.0   王五\n",
       "3     s5    NaN   赵六"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(df1,df2,on='stu_no',how='right') # 右连接，以右边的表为准，如果左边没有对应的数据则为空"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>stu_no</th>\n",
       "      <th>score</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>s1</td>\n",
       "      <td>97.0</td>\n",
       "      <td>张三</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>s2</td>\n",
       "      <td>89.0</td>\n",
       "      <td>李四</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>s3</td>\n",
       "      <td>81.0</td>\n",
       "      <td>王五</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>s4</td>\n",
       "      <td>94.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>s5</td>\n",
       "      <td>NaN</td>\n",
       "      <td>赵六</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  stu_no  score name\n",
       "0     s1   97.0   张三\n",
       "1     s2   89.0   李四\n",
       "2     s3   81.0   王五\n",
       "3     s4   94.0  NaN\n",
       "4     s5    NaN   赵六"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(df1,df2,on='stu_no',how='outer') # 属于outer join(外连接，显示所有的字段)连接"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## cancat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0  1\n",
       "0  1  2\n",
       "1  3  4"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df1 = pd.DataFrame(np.arange(1,5).reshape(2,-1))\n",
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     0    1\n",
       "0  0.0  0.0\n",
       "1  0.0  0.0"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df2 = pd.DataFrame(np.zeros((2,2)))\n",
    "df2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     0    1\n",
       "0  1.0  2.0\n",
       "1  3.0  4.0\n",
       "0  0.0  0.0\n",
       "1  0.0  0.0"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([df1,df2]) # 列的合并"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0  1    0    1\n",
       "0  1  2  0.0  0.0\n",
       "1  3  4  0.0  0.0"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([df1,df2],axis = 1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 异常值的分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>8.0</td>\n",
       "      <td>9.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>4.0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     a    b\n",
       "0  1.0  NaN\n",
       "1  4.0  5.0\n",
       "2  NaN  NaN\n",
       "3  8.0  9.0\n",
       "4  4.0  5.0"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = [\n",
    "    [1,None],\n",
    "    [4,5],\n",
    "    [None,None],\n",
    "    [8,9],\n",
    "    [4,5],\n",
    "]\n",
    "df = pd.DataFrame(data,columns=['a','b'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     a    b\n",
       "0  1.0  NaN\n",
       "1  4.0  5.0"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>8.0</td>\n",
       "      <td>9.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>4.0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     a    b\n",
       "2  NaN  NaN\n",
       "3  8.0  9.0\n",
       "4  4.0  5.0"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.tail(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 5 entries, 0 to 4\n",
      "Data columns (total 2 columns):\n",
      "a    4 non-null float64\n",
      "b    3 non-null float64\n",
      "dtypes: float64(2)\n",
      "memory usage: 208.0 bytes\n"
     ]
    }
   ],
   "source": [
    "df.info() # 查看表的属性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    2.872281\n",
       "b    2.309401\n",
       "dtype: float64"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.std()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>count</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>3.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>mean</td>\n",
       "      <td>4.250000</td>\n",
       "      <td>6.333333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>std</td>\n",
       "      <td>2.872281</td>\n",
       "      <td>2.309401</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>min</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>25%</td>\n",
       "      <td>3.250000</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>50%</td>\n",
       "      <td>4.000000</td>\n",
       "      <td>5.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>75%</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>7.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>max</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>9.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              a         b\n",
       "count  4.000000  3.000000\n",
       "mean   4.250000  6.333333\n",
       "std    2.872281  2.309401\n",
       "min    1.000000  5.000000\n",
       "25%    3.250000  5.000000\n",
       "50%    4.000000  5.000000\n",
       "75%    5.000000  7.000000\n",
       "max    8.000000  9.000000"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    2.872281\n",
       "b    2.309401\n",
       "dtype: float64"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.std() #标准差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    4\n",
       "b    3\n",
       "dtype: int64"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.count() #获得一列有多少数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    4.250000\n",
       "b    6.333333\n",
       "dtype: float64"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.mean() # 获得每一列的平均值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    8.250000\n",
       "b    5.333333\n",
       "dtype: float64"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.var() #获得方差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>日期</th>\n",
       "      <th>名称</th>\n",
       "      <th>类别</th>\n",
       "      <th>单价</th>\n",
       "      <th>数量</th>\n",
       "      <th>金额</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>2018-07-01</td>\n",
       "      <td>商品A</td>\n",
       "      <td>服装</td>\n",
       "      <td>20</td>\n",
       "      <td>2</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2018-07-02</td>\n",
       "      <td>商品B</td>\n",
       "      <td>服装</td>\n",
       "      <td>200</td>\n",
       "      <td>3</td>\n",
       "      <td>600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>2018-07-03</td>\n",
       "      <td>商品C</td>\n",
       "      <td>食品</td>\n",
       "      <td>1200</td>\n",
       "      <td>4</td>\n",
       "      <td>4800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>2018-07-04</td>\n",
       "      <td>商品A</td>\n",
       "      <td>服装</td>\n",
       "      <td>22</td>\n",
       "      <td>5</td>\n",
       "      <td>110</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>2018-07-05</td>\n",
       "      <td>商品B</td>\n",
       "      <td>服装</td>\n",
       "      <td>220</td>\n",
       "      <td>6</td>\n",
       "      <td>1320</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>2018-07-06</td>\n",
       "      <td>商品C</td>\n",
       "      <td>食品</td>\n",
       "      <td>1000</td>\n",
       "      <td>7</td>\n",
       "      <td>7000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>2018-07-07</td>\n",
       "      <td>商品A</td>\n",
       "      <td>服装</td>\n",
       "      <td>30</td>\n",
       "      <td>3</td>\n",
       "      <td>90</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>2018-07-08</td>\n",
       "      <td>商品A</td>\n",
       "      <td>服装</td>\n",
       "      <td>800</td>\n",
       "      <td>1</td>\n",
       "      <td>800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>2018-07-09</td>\n",
       "      <td>商品C</td>\n",
       "      <td>食品</td>\n",
       "      <td>1300</td>\n",
       "      <td>4</td>\n",
       "      <td>5200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>2018-07-10</td>\n",
       "      <td>商品B</td>\n",
       "      <td>服装</td>\n",
       "      <td>230</td>\n",
       "      <td>3</td>\n",
       "      <td>690</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>2018-07-11</td>\n",
       "      <td>商品A</td>\n",
       "      <td>服装</td>\n",
       "      <td>28</td>\n",
       "      <td>1</td>\n",
       "      <td>28</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           日期   名称  类别    单价  数量    金额\n",
       "0  2018-07-01  商品A  服装    20   2    40\n",
       "1  2018-07-02  商品B  服装   200   3   600\n",
       "2  2018-07-03  商品C  食品  1200   4  4800\n",
       "3  2018-07-04  商品A  服装    22   5   110\n",
       "4  2018-07-05  商品B  服装   220   6  1320\n",
       "5  2018-07-06  商品C  食品  1000   7  7000\n",
       "6  2018-07-07  商品A  服装    30   3    90\n",
       "7  2018-07-08  商品A  服装   800   1   800\n",
       "8  2018-07-09  商品C  食品  1300   4  5200\n",
       "9  2018-07-10  商品B  服装   230   3   690\n",
       "10 2018-07-11  商品A  服装    28   1    28"
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_excel('./pandas中的数据分组与透视表/data.xlsx')\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "grouped = df.groupby('类别')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "服装\n",
      "           日期   名称  类别   单价  数量    金额\n",
      "0  2018-07-01  商品A  服装   20   2    40\n",
      "1  2018-07-02  商品B  服装  200   3   600\n",
      "3  2018-07-04  商品A  服装   22   5   110\n",
      "4  2018-07-05  商品B  服装  220   6  1320\n",
      "6  2018-07-07  商品A  服装   30   3    90\n",
      "7  2018-07-08  商品A  服装  800   1   800\n",
      "9  2018-07-10  商品B  服装  230   3   690\n",
      "10 2018-07-11  商品A  服装   28   1    28\n",
      "食品\n",
      "          日期   名称  类别    单价  数量    金额\n",
      "2 2018-07-03  商品C  食品  1200   4  4800\n",
      "5 2018-07-06  商品C  食品  1000   7  7000\n",
      "8 2018-07-09  商品C  食品  1300   4  5200\n"
     ]
    }
   ],
   "source": [
    "for name,data  in grouped:\n",
    "    print(name)\n",
    "    print(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>数量</th>\n",
       "      <th>金额</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>类别</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>服装</td>\n",
       "      <td>24</td>\n",
       "      <td>3678</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>食品</td>\n",
       "      <td>15</td>\n",
       "      <td>17000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    数量     金额\n",
       "类别           \n",
       "服装  24   3678\n",
       "食品  15  17000"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grouped['数量','金额'].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "类别  名称 \n",
       "服装  商品A     800\n",
       "    商品B     230\n",
       "食品  商品C    1300\n",
       "Name: 单价, dtype: int64"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grouped = df.groupby(['类别','名称'])\n",
    "grouped['单价'].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>单价</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>类别</th>\n",
       "      <th>名称</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td rowspan=\"2\" valign=\"top\">服装</td>\n",
       "      <td>商品A</td>\n",
       "      <td>800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>商品B</td>\n",
       "      <td>230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>食品</td>\n",
       "      <td>商品C</td>\n",
       "      <td>1300</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          单价\n",
       "类别 名称       \n",
       "服装 商品A   800\n",
       "   商品B   230\n",
       "食品 商品C  1300"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grouped[['单价']].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "服装\n",
      "['商品A' '商品B']\n",
      "食品\n",
      "['商品C']\n"
     ]
    }
   ],
   "source": [
    "grouped = df.groupby('类别')\n",
    "for name,data  in grouped:\n",
    "    print(name)\n",
    "    print(data['名称'].unique())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## pandas事件序列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',\n",
       "               '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',\n",
       "               '2020-01-09', '2020-01-10', '2020-01-11', '2020-01-12',\n",
       "               '2020-01-13', '2020-01-14', '2020-01-15', '2020-01-16',\n",
       "               '2020-01-17', '2020-01-18', '2020-01-19', '2020-01-20',\n",
       "               '2020-01-21', '2020-01-22', '2020-01-23', '2020-01-24',\n",
       "               '2020-01-25', '2020-01-26', '2020-01-27', '2020-01-28',\n",
       "               '2020-01-29', '2020-01-30', '2020-01-31', '2020-02-01',\n",
       "               '2020-02-02', '2020-02-03', '2020-02-04', '2020-02-05',\n",
       "               '2020-02-06', '2020-02-07', '2020-02-08', '2020-02-09',\n",
       "               '2020-02-10', '2020-02-11', '2020-02-12', '2020-02-13',\n",
       "               '2020-02-14', '2020-02-15', '2020-02-16', '2020-02-17',\n",
       "               '2020-02-18', '2020-02-19', '2020-02-20', '2020-02-21',\n",
       "               '2020-02-22', '2020-02-23', '2020-02-24', '2020-02-25',\n",
       "               '2020-02-26', '2020-02-27', '2020-02-28', '2020-02-29',\n",
       "               '2020-03-01'],\n",
       "              dtype='datetime64[ns]', freq='D')"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 初始化事件序列\n",
    "pd.date_range('2020-1-1','2020-3-1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatetimeIndex(['2020-01-05', '2020-01-12', '2020-01-19', '2020-01-26',\n",
       "               '2020-02-02', '2020-02-09', '2020-02-16', '2020-02-23',\n",
       "               '2020-03-01'],\n",
       "              dtype='datetime64[ns]', freq='W-SUN')"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.date_range('2020-1-1','2020-3-1',freq='w') # 以周围单位"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# D天 Q季度 M月 w周 H时 T分 S秒"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatetimeIndex(['2020-03-31', '2020-06-30', '2020-09-30', '2020-12-31',\n",
       "               '2021-03-31', '2021-06-30', '2021-09-30', '2021-12-31',\n",
       "               '2022-03-31', '2022-06-30'],\n",
       "              dtype='datetime64[ns]', freq='Q-DEC')"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.date_range('2020-01-01',freq = 'Q',periods = 10) # 从2020-01-01开始，以季度为单位，产生10个日期"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>time</th>\n",
       "      <th>cpu</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>2020-01-01 00:00:00</td>\n",
       "      <td>9.226157</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>2020-01-01 00:01:00</td>\n",
       "      <td>10.342742</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>2020-01-01 00:02:00</td>\n",
       "      <td>9.913512</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>2020-01-01 00:03:00</td>\n",
       "      <td>9.370216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>2020-01-01 00:04:00</td>\n",
       "      <td>11.107313</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>199995</td>\n",
       "      <td>2020-05-18 21:15:00</td>\n",
       "      <td>9.590749</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>199996</td>\n",
       "      <td>2020-05-18 21:16:00</td>\n",
       "      <td>9.701630</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>199997</td>\n",
       "      <td>2020-05-18 21:17:00</td>\n",
       "      <td>11.215061</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>199998</td>\n",
       "      <td>2020-05-18 21:18:00</td>\n",
       "      <td>10.103853</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>199999</td>\n",
       "      <td>2020-05-18 21:19:00</td>\n",
       "      <td>9.374185</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>200000 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                      time        cpu\n",
       "0      2020-01-01 00:00:00   9.226157\n",
       "1      2020-01-01 00:01:00  10.342742\n",
       "2      2020-01-01 00:02:00   9.913512\n",
       "3      2020-01-01 00:03:00   9.370216\n",
       "4      2020-01-01 00:04:00  11.107313\n",
       "...                    ...        ...\n",
       "199995 2020-05-18 21:15:00   9.590749\n",
       "199996 2020-05-18 21:16:00   9.701630\n",
       "199997 2020-05-18 21:17:00  11.215061\n",
       "199998 2020-05-18 21:18:00  10.103853\n",
       "199999 2020-05-18 21:19:00   9.374185\n",
       "\n",
       "[200000 rows x 2 columns]"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {\n",
    "    'time':pd.date_range('2020-01-01',freq = 'T',periods = 200000),\n",
    "    'cpu':np.random.randn(200000) + 10\n",
    "}\n",
    "df = pd.DataFrame(data,columns = ['time','cpu'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>time</th>\n",
       "      <th>cpu</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>time</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:00:00</td>\n",
       "      <td>2020-01-01 00:00:00</td>\n",
       "      <td>9.226157</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:01:00</td>\n",
       "      <td>2020-01-01 00:01:00</td>\n",
       "      <td>10.342742</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:02:00</td>\n",
       "      <td>2020-01-01 00:02:00</td>\n",
       "      <td>9.913512</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:03:00</td>\n",
       "      <td>2020-01-01 00:03:00</td>\n",
       "      <td>9.370216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:04:00</td>\n",
       "      <td>2020-01-01 00:04:00</td>\n",
       "      <td>11.107313</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                   time        cpu\n",
       "time                                              \n",
       "2020-01-01 00:00:00 2020-01-01 00:00:00   9.226157\n",
       "2020-01-01 00:01:00 2020-01-01 00:01:00  10.342742\n",
       "2020-01-01 00:02:00 2020-01-01 00:02:00   9.913512\n",
       "2020-01-01 00:03:00 2020-01-01 00:03:00   9.370216\n",
       "2020-01-01 00:04:00 2020-01-01 00:04:00  11.107313"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 为了查看一小时内cpu占用情况，现在是一分钟采样，数据太多，改造DataFrame\n",
    "# 让事件作为索引，改造成五分钟一次，求五分钟的平均值\n",
    "s = pd.to_datetime(df.time)\n",
    "df.index = s\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cpu</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>time</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:00:00</td>\n",
       "      <td>9.226157</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:01:00</td>\n",
       "      <td>10.342742</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:02:00</td>\n",
       "      <td>9.913512</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:03:00</td>\n",
       "      <td>9.370216</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:04:00</td>\n",
       "      <td>11.107313</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           cpu\n",
       "time                          \n",
       "2020-01-01 00:00:00   9.226157\n",
       "2020-01-01 00:01:00  10.342742\n",
       "2020-01-01 00:02:00   9.913512\n",
       "2020-01-01 00:03:00   9.370216\n",
       "2020-01-01 00:04:00  11.107313"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df.drop('time',axis = 1)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cpu</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>time</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2020-04-02 08:00:00</td>\n",
       "      <td>11.712993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 08:01:00</td>\n",
       "      <td>9.042127</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 08:02:00</td>\n",
       "      <td>9.273611</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 08:03:00</td>\n",
       "      <td>9.878236</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 08:04:00</td>\n",
       "      <td>9.716202</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 08:05:00</td>\n",
       "      <td>11.908405</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 08:06:00</td>\n",
       "      <td>10.222430</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 08:07:00</td>\n",
       "      <td>8.571608</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 08:08:00</td>\n",
       "      <td>9.478759</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 08:09:00</td>\n",
       "      <td>8.728999</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 08:10:00</td>\n",
       "      <td>8.887610</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           cpu\n",
       "time                          \n",
       "2020-04-02 08:00:00  11.712993\n",
       "2020-04-02 08:01:00   9.042127\n",
       "2020-04-02 08:02:00   9.273611\n",
       "2020-04-02 08:03:00   9.878236\n",
       "2020-04-02 08:04:00   9.716202\n",
       "2020-04-02 08:05:00  11.908405\n",
       "2020-04-02 08:06:00  10.222430\n",
       "2020-04-02 08:07:00   8.571608\n",
       "2020-04-02 08:08:00   9.478759\n",
       "2020-04-02 08:09:00   8.728999\n",
       "2020-04-02 08:10:00   8.887610"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['2020-04-02 08:00:00':'2020-04-02 08:10:00'] # 筛选十分钟的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cpu</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>time</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2020-04-02 00:00:00</td>\n",
       "      <td>10.329608</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 00:01:00</td>\n",
       "      <td>8.683585</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 00:02:00</td>\n",
       "      <td>10.623519</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 00:03:00</td>\n",
       "      <td>8.577942</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 00:04:00</td>\n",
       "      <td>9.838218</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 23:55:00</td>\n",
       "      <td>10.728765</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 23:56:00</td>\n",
       "      <td>10.341683</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 23:57:00</td>\n",
       "      <td>11.258538</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 23:58:00</td>\n",
       "      <td>10.249602</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-04-02 23:59:00</td>\n",
       "      <td>9.736691</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1440 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                           cpu\n",
       "time                          \n",
       "2020-04-02 00:00:00  10.329608\n",
       "2020-04-02 00:01:00   8.683585\n",
       "2020-04-02 00:02:00  10.623519\n",
       "2020-04-02 00:03:00   8.577942\n",
       "2020-04-02 00:04:00   9.838218\n",
       "...                        ...\n",
       "2020-04-02 23:55:00  10.728765\n",
       "2020-04-02 23:56:00  10.341683\n",
       "2020-04-02 23:57:00  11.258538\n",
       "2020-04-02 23:58:00  10.249602\n",
       "2020-04-02 23:59:00   9.736691\n",
       "\n",
       "[1440 rows x 1 columns]"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['2020-04-02'] # 采集一天的数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cpu</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2020-01-01</td>\n",
       "      <td>9.977552</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-02</td>\n",
       "      <td>9.955422</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-03</td>\n",
       "      <td>9.981885</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-04</td>\n",
       "      <td>10.027131</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-05</td>\n",
       "      <td>9.981784</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-05-14</td>\n",
       "      <td>9.955085</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-05-15</td>\n",
       "      <td>9.953459</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-05-16</td>\n",
       "      <td>10.003765</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-05-17</td>\n",
       "      <td>9.974534</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-05-18</td>\n",
       "      <td>9.991151</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>139 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                  cpu\n",
       "2020-01-01   9.977552\n",
       "2020-01-02   9.955422\n",
       "2020-01-03   9.981885\n",
       "2020-01-04  10.027131\n",
       "2020-01-05   9.981784\n",
       "...               ...\n",
       "2020-05-14   9.955085\n",
       "2020-05-15   9.953459\n",
       "2020-05-16  10.003765\n",
       "2020-05-17   9.974534\n",
       "2020-05-18   9.991151\n",
       "\n",
       "[139 rows x 1 columns]"
      ]
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby(df.index.date).mean() # 筛选每天cpu占用率的平均值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cpu</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>time</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>9.984755</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>9.996123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>10.002002</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>10.000085</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>10.012303</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>9.992394</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>10.016884</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>10.005572</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>9.994232</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>10.003573</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11</td>\n",
       "      <td>10.015915</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>12</td>\n",
       "      <td>10.001448</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>13</td>\n",
       "      <td>10.005086</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14</td>\n",
       "      <td>9.986911</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>15</td>\n",
       "      <td>10.011305</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>16</td>\n",
       "      <td>10.003682</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>17</td>\n",
       "      <td>9.998980</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>18</td>\n",
       "      <td>9.991375</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>19</td>\n",
       "      <td>10.007073</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>20</td>\n",
       "      <td>9.973125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>21</td>\n",
       "      <td>9.991151</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            cpu\n",
       "time           \n",
       "1      9.984755\n",
       "2      9.996123\n",
       "3     10.002002\n",
       "4     10.000085\n",
       "5     10.012303\n",
       "6      9.992394\n",
       "7     10.016884\n",
       "8     10.005572\n",
       "9      9.994232\n",
       "10    10.003573\n",
       "11    10.015915\n",
       "12    10.001448\n",
       "13    10.005086\n",
       "14     9.986911\n",
       "15    10.011305\n",
       "16    10.003682\n",
       "17     9.998980\n",
       "18     9.991375\n",
       "19    10.007073\n",
       "20     9.973125\n",
       "21     9.991151"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.groupby(df.index.week).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cpu</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>time</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:00:00</td>\n",
       "      <td>9.991988</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:05:00</td>\n",
       "      <td>10.057814</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:10:00</td>\n",
       "      <td>9.238935</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:15:00</td>\n",
       "      <td>9.934953</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-01-01 00:20:00</td>\n",
       "      <td>9.202381</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-05-18 20:55:00</td>\n",
       "      <td>10.649478</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-05-18 21:00:00</td>\n",
       "      <td>10.060117</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-05-18 21:05:00</td>\n",
       "      <td>9.685733</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-05-18 21:10:00</td>\n",
       "      <td>9.665704</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2020-05-18 21:15:00</td>\n",
       "      <td>9.997096</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>40000 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                           cpu\n",
       "time                          \n",
       "2020-01-01 00:00:00   9.991988\n",
       "2020-01-01 00:05:00  10.057814\n",
       "2020-01-01 00:10:00   9.238935\n",
       "2020-01-01 00:15:00   9.934953\n",
       "2020-01-01 00:20:00   9.202381\n",
       "...                        ...\n",
       "2020-05-18 20:55:00  10.649478\n",
       "2020-05-18 21:00:00  10.060117\n",
       "2020-05-18 21:05:00   9.685733\n",
       "2020-05-18 21:10:00   9.665704\n",
       "2020-05-18 21:15:00   9.997096\n",
       "\n",
       "[40000 rows x 1 columns]"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.resample('5T').mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
